import numpy as np
import pandas as pd
import iqplot
import bokeh.io
import holoviews as hv
import scipy
import warnings
import scipy.optimize
import scipy.stats as st
import tqdm
import bebi103
from scipy.special import logsumexp
# Use the Bokeh plotting backend for HoloViews and render Bokeh plots
# inline in the notebook.
hv.extension("bokeh")
bokeh.io.output_notebook()
/Users/davidjohnson/opt/anaconda3/lib/python3.8/site-packages/arviz/__init__.py:317: UserWarning: Trying to register the cmap 'cet_gray' which already exists.
register_cmap("cet_" + name, cmap=cmap)
/Users/davidjohnson/opt/anaconda3/lib/python3.8/site-packages/arviz/__init__.py:317: UserWarning: Trying to register the cmap 'cet_gray_r' which already exists.
register_cmap("cet_" + name, cmap=cmap)
# comment = '#' lets read_csv() know to skip over the lines that start with a pound sign
df = pd.read_csv('https://s3.amazonaws.com/bebi103.caltech.edu/data/gardner_mt_catastrophe_only_tubulin.csv', comment = '#')
# Wide -> long ("tidy") format: one row per (tubulin concentration, catastrophe time).
df = df.melt()
# melt() pads the shorter columns with NaN; drop those filler rows.
df = df.dropna()
df.head()
| variable | value | |
|---|---|---|
| 0 | 12 uM | 25.000 |
| 1 | 12 uM | 40.000 |
| 2 | 12 uM | 40.000 |
| 3 | 12 uM | 45.429 |
| 4 | 12 uM | 50.000 |
Defined below are all of the functions we will use in our analysis. They are adapted from, or taken directly from, Justin Bois's course materials.
# Module-level random number generator shared by all sampling helpers below.
rg = np.random.default_rng()
def ecdf(x, data):
    """Evaluate the empirical CDF of `data` at arbitrary points `x`."""
    sorted_data = np.sort(data)
    n_obs = len(sorted_data)
    # Grid of ECDF values, 0/n .. n/n; index i holds the fraction of
    # observations that are <= the i-th order statistic.
    cdf_grid = np.arange(n_obs + 1) / n_obs
    idx = np.searchsorted(sorted_data, x, side="right")
    return cdf_grid[idx]
def draw_bs_sample(data):
    """Return one bootstrap resample (with replacement) of a 1D data set."""
    n_obs = len(data)
    return rg.choice(data, size=n_obs)
def log_like_iid_gamma(params, n):
    """Log likelihood for i.i.d. Gamma measurements.

    Parameters
    ----------
    params : tuple of float
        (alpha, beta): shape parameter and rate parameter.
    n : array_like
        Observed measurements (catastrophe times).

    Returns
    -------
    float
        Sum of Gamma log PDF values, or -inf if either parameter is
        out of its domain.
    """
    alpha, beta = params
    if alpha <= 0 or beta <= 0:
        return -np.inf
    # scipy's gamma is parametrized by shape and *scale*.  The third
    # positional argument of logpdf is `loc`, so 1/beta must be passed
    # by keyword as `scale` (the original passed it as `loc`, shifting
    # the distribution instead of scaling it).
    return np.sum(st.gamma.logpdf(n, alpha, scale=1/beta))
def log_like_iid_mix(params, n):
    """Log likelihood for i.i.d. times under the two-exponential model.

    The model PDF (stated in the notebook text) is

        f(t; b1, b2) = b1 b2 / (b2 - b1) * (exp(-b1 t) - exp(-b2 t)),

    the convolution of Exponential(b1) and Exponential(b2).  The original
    code summed two exponential log PDFs with the rates passed as `loc`,
    which is not the likelihood of any generative model.

    Parameters
    ----------
    params : tuple of float
        (b1, b2): the two rate parameters.
    n : array_like
        Observed measurements (catastrophe times).

    Returns
    -------
    float
        Sum of log PDF values, or -inf if either rate is out of domain.
    """
    b1, b2 = params
    if b1 <= 0 or b2 <= 0:
        return -np.inf
    if np.isclose(b1, b2):
        # Limit b2 -> b1: sum of two i.i.d. exponentials is Gamma(2, b1).
        return np.sum(st.gamma.logpdf(n, 2, scale=1/b1))
    # Order the rates so exp(-b1 t) - exp(-b2 t) > 0 for t > 0.
    if b1 > b2:
        b1, b2 = b2, b1
    t = np.atleast_1d(np.asarray(n, dtype=float))
    # log(exp(-b1 t) - exp(-b2 t)) computed stably via signed logsumexp.
    log_diff = logsumexp(
        np.stack((-b1 * t, -b2 * t)), axis=0, b=np.array([[1.0], [-1.0]])
    )
    return np.sum(np.log(b1) + np.log(b2) - np.log(b2 - b1) + log_diff)
def mle_iid_gamma(n):
    """Perform maximum likelihood estimates for parameters for i.i.d.
    Gamma measurements, parametrized by alpha, b=1/beta"""
    # Powell can wander through invalid parameter values and trigger
    # numerical warnings; silence them for the duration of the fit only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        res = scipy.optimize.minimize(
            fun=lambda params, n: -log_like_iid_gamma(params, n),
            x0=np.array([1, 0.0001]),
            args=(n,),
            method='Powell',
        )
    # Guard clause: surface optimizer failure to the caller.
    if not res.success:
        raise RuntimeError('Convergence failed with message', res.message)
    return res.x
def draw_parametric_bs_reps_mle(
    mle_fun, gen_fun, data, args=(), size=1, progress_bar=False
):
    """Draw parametric bootstrap replicates of maximum likelihood estimator.
    Parameters
    ----------
    mle_fun : function
        Function with call signature mle_fun(data, *args) that computes
        a MLE for the parameters
    gen_fun : function
        Function to randomly draw a new data set out of the model
        distribution parametrized by the MLE. Must have call
        signature `gen_fun(*params, *args, size)`.
    data : one-dimensional Numpy array
        Array of measurements
    args : tuple, default ()
        Arguments to be passed to `mle_fun()` and `gen_fun()`.
    size : int, default 1
        Number of bootstrap replicates to draw.
    progress_bar : bool, default False
        Whether or not to display progress bar.
    Returns
    -------
    output : numpy array
        Bootstrap replicates of MLEs.
    """
    # Fit the model to the observed data; these MLEs parametrize the
    # generative distribution used for simulation.
    params = mle_fun(data, *args)

    iterator = tqdm.tqdm(range(size)) if progress_bar else range(size)

    # Simulate a data set of the same size and re-fit, `size` times.
    # Note: *args is forwarded to mle_fun here as well — the original
    # only passed it on the first call, so any non-empty `args` crashed.
    return np.array(
        [
            mle_fun(gen_fun(*params, *args, size=len(data)), *args)
            for _ in iterator
        ]
    )
def draw_parametric_bs_reps_mle_two(
    mle_fun, gen_fun, data, args=(), size=1, progress_bar=False
):
    """Draw parametric bootstrap replicates of maximum likelihood estimator.
    Parameters
    ----------
    mle_fun : function
        Function with call signature mle_fun(data, *args) that computes
        a MLE for the parameters
    gen_fun : function
        Function to randomly draw a new data set out of the model
        distribution parametrized by the MLE. Must have call
        signature `gen_fun(*params, size)`.
    data : one-dimensional Numpy array
        Array of measurements
    args : tuple, default ()
        Arguments to be passed to `mle_fun()`.
    size : int, default 1
        Number of bootstrap replicates to draw.
    progress_bar : bool, default False
        Whether or not to display progress bar.
    Returns
    -------
    output : numpy array
        Bootstrap replicates of MLEs.
    """
    # Fit once to the observed data.  The original accepted `args` but
    # silently ignored it; forward it as documented, consistent with
    # draw_parametric_bs_reps_mle.  (Backward compatible: the notebook's
    # only caller passes args=().)
    params = mle_fun(data, *args)

    iterator = tqdm.tqdm(range(size)) if progress_bar else range(size)

    # Simulate a same-size data set from the fitted model and re-fit.
    return np.array(
        [mle_fun(gen_fun(*params, size=len(data)), *args) for _ in iterator]
    )
def gen_gamma(alpha, beta, size):
    """Draw `size` samples from a Gamma distribution with shape `alpha` and rate `beta`."""
    # numpy's gamma takes a *scale* parameter, i.e. 1/rate.
    scale = 1 / beta
    return rg.gamma(alpha, scale, size=size)
def gen_b1_b2(beta_1, beta_2, size):
    """Draw `size` samples of the sum of two exponentials with rates `beta_1`, `beta_2`."""
    # numpy's exponential takes a scale (mean) parameter, i.e. 1/rate.
    first_step = rg.exponential(1 / beta_1, size=size)
    second_step = rg.exponential(1 / beta_2, size=size)
    return first_step + second_step
def b1_b2_mle(data):
    """Estimate the two rates, constrained equal, as 2 / mean(data).

    Returns the same value for both rates — presumably the equal-rate
    limit of the two-exponential model, where the MLE for the common
    rate is 2 / mean (TODO confirm against the modeling writeup).
    """
    common_rate = 2 / np.mean(data)
    return common_rate, common_rate
We then extract the catastrophe times for only the experiments with 12 uM of tubulin to compare our two models.
The two candidate models for the catastrophe times are:

\begin{align}
\text{Gamma distribution PDF:}\quad f(t;\alpha, \beta) = \frac{1}{\Gamma(\alpha)} \frac{(\beta t)^\alpha}{t}\, \mathrm{e}^{-\beta t}
\end{align}

\begin{align}
\text{Mixture model PDF:}\quad f(t;\beta_1, \beta_2) = \frac{\beta_1\beta_2}{\beta_2 - \beta_1}\left(\mathrm{e}^{-\beta_1 t} - \mathrm{e}^{-\beta_2 t}\right).
\end{align}

vals_12 = df.loc[df['variable'] == '12 uM', 'value'].values
We first obtain the MLEs for α and β parameters of the gamma model. We also compute the 95% confidence interval for the parameters.
# Parametric bootstrap: 10,000 replicate MLEs of (alpha, beta) for the
# Gamma model, each fit to a data set simulated from the model
# parametrized by the observed-data MLE.
bs_reps_parametric = draw_parametric_bs_reps_mle(
    mle_iid_gamma,
    gen_gamma,
    vals_12,
    args=(),
    size=10000,
    progress_bar=True,
)
# 95% confidence interval: 2.5th and 97.5th percentiles of the replicates,
# taken parameter-by-parameter (axis=0).
conf_int = np.percentile(bs_reps_parametric, [2.5, 97.5], axis=0)
print('MLEs for the Gamma model')
print('95% confidence interval for α: ' + str(conf_int[0][0]) + ' — ' + str(conf_int[1][0]))
print('95% confidence interval for β: ' + str(conf_int[0][1]) + ' — ' + str(conf_int[1][1]))
100%|████████████████████████████████████| 10000/10000 [00:21<00:00, 472.52it/s]
MLEs for the Gamma model 95% confidence interval for α: 3.587928961545842 — 3.587928961545842 95% confidence interval for β: 0.0759280927580305 — 0.3394935543278904
Then, we obtain the MLEs for both βs in the mixture model with 95% confidence intervals.
# Parametric bootstrap: 10,000 replicate MLEs of (beta_1, beta_2) for the
# mixture (two-exponential) model.
bs_reps_parametric_two = draw_parametric_bs_reps_mle_two(
    b1_b2_mle,
    gen_b1_b2,
    vals_12,
    args=(),
    size=10000,
    progress_bar=True,
)
# 95% confidence interval: 2.5th and 97.5th percentiles, per parameter.
conf_int_2 = np.percentile(bs_reps_parametric_two, [2.5, 97.5], axis=0)
print('MLEs for the Mixture model')
# The mixture model's parameters are the rates β1 and β2; the original
# labels incorrectly reused α and β from the Gamma model.
print('95% confidence interval for β1: ' + str(conf_int_2[0][0]) + ' — ' + str(conf_int_2[1][0]))
print('95% confidence interval for β2: ' + str(conf_int_2[0][1]) + ' — ' + str(conf_int_2[1][1]))
100%|██████████████████████████████████| 10000/10000 [00:00<00:00, 40981.93it/s]
MLEs for the Mixture model 95% confidence interval for α: 0.00499295870190454 — 0.005532777926715548 95% confidence interval for β: 0.00499295870190454 — 0.005532777926715548
We then used the MLEs for both sets of parameters to generate predictive ecdfs to compare the models to the actual data and against each other.
# Graphical model assessment: simulate many data sets of the observed size
# from each fitted model and compare their predictive ECDFs to the data.
size = len(vals_12)
alpha, beta = mle_iid_gamma(vals_12)
b_1, b_2 = b1_b2_mle(vals_12)
# 100,000 simulated data sets from the mixture (two-exponential) model
mix_samples = np.array(
    [gen_b1_b2(b_1, b_2, size=size) for _ in tqdm.tqdm(range(100000))]
)
# 100,000 simulated data sets from the Gamma model
gamma_samples = np.array(
    [gen_gamma(alpha, beta, size=size) for _ in tqdm.tqdm(range(100000))]
)
# Predictive ECDF bands for each model with the observed data overlaid
gamma_pecdf = bebi103.viz.predictive_ecdf(
    samples=gamma_samples, data=vals_12, discrete=True, x_axis_label="n", title = 'Gamma'
)
mix_pecdf = bebi103.viz.predictive_ecdf(
    samples=mix_samples, data=vals_12, discrete=True, x_axis_label="n", title = 'Mixture'
)
# Difference plots (data ECDF minus predictive) to make misfit visible
gamma_diff = bebi103.viz.predictive_ecdf(
    samples=gamma_samples, data=vals_12, diff='ecdf', discrete=True, x_axis_label="n", title = 'Gamma'
)
mix_diff = bebi103.viz.predictive_ecdf(
    samples=mix_samples, data=vals_12, diff='ecdf', discrete=True, x_axis_label="n", title = 'Mixture'
)
# 2x2 grid: predictive ECDFs on top, difference plots below
bokeh.io.show(bokeh.layouts.gridplot([gamma_pecdf, mix_pecdf, gamma_diff, mix_diff], ncols = 2))
100%|████████████████████████████████| 100000/100000 [00:01<00:00, 53744.37it/s] 100%|████████████████████████████████| 100000/100000 [00:02<00:00, 43002.47it/s]
From the plots above, while the mixture model does not appear to model the actual phenomenon "perfectly", it is certainly doing a better job than the gamma model. Based on this graphical model assessment, we decided to move forward with the mixture model over the gamma model. We then compute the MLEs for each concentration of tubulin. From the MLEs, it seems that the experiments with lower amounts of tubulin had shorter lengths of time between catastrophes.
# Tabulate the mixture-model MLE rates for every tubulin concentration.
df_mle = pd.DataFrame(index=['beta_1_mix', 'beta_2_mix'])
for concentration in df['variable'].unique():
    times = df.loc[df['variable'] == concentration, 'value'].values
    # Mixture model MLE for this concentration's catastrophe times;
    # one column per concentration, rows are the two rates.
    df_mle[concentration] = list(b1_b2_mle(times))
df_mle
| 12 uM | 7 uM | 9 uM | 10 uM | 14 uM | |
|---|---|---|---|---|---|
| beta_1_mix | 0.005255 | 0.006179 | 0.006552 | 0.005625 | 0.004269 |
| beta_2_mix | 0.005255 | 0.006179 | 0.006552 | 0.005625 | 0.004269 |